Test CellMixS metrics on Splatter synthetic data

Introduction

We are interested in evaluating integration of scRNA-seq datasets of heterogeneous cell type composition and batch size, which are typical in tumor samples. Here we will test CellMixS batch effect metrics on synthetic datasets generated with splatter We will simulate datasets that can contain one or two of two possible cell types, with different levels of batch effects and relative batch sizes

R environment

Get and load some useful packages

renv::restore()

if (!require("remotes", quietly = TRUE))
    install.packages("remotes")
library(remotes)

if (!require("tidyr", quietly = TRUE))
install.packages("tidyr")

if (!require("scIntegrationMetrics", quietly = TRUE))
install_github("carmonalab/scIntegrationMetrics") #calculates LISI and Silhouette

if (!requireNamespace("BiocManager"))
    install.packages("BiocManager")

if (!requireNamespace("CellMixS"))
  BiocManager::install("CellMixS")

if (!requireNamespace("CellMixS"))
  BiocManager::install("splatter")
library(dplyr)
library(ggplot2)
library(tidyr)
library(scIntegrationMetrics)
library(patchwork)
library(CellMixS)
library(splatter)
library(Seurat)
library(BiocParallel)

seed = 1234
set.seed(seed)
# Load required packages
suppressPackageStartupMessages({
    library(SingleCellExperiment)
    library(dplyr)
    library(purrr)
    library(ggplot2)
    library(scater)
})

Prepare extreme heterogeneous datasets (batch effects + different subtype composition) to evaluate metrics

# The next function will generate synthetic datasets, in two possible modes:
# mode twoGroups=T
# will generate 3 batches including two cell types/groups
# Batch 1: contains cell types A and B with equal proportions
# Batch 2: contains only cell type A (cells Group 1)
# Batch 3: contains only cell type B (cells Group 2)
# Batch 1 contains a frequency of `freqBatchAB` of total cells; and the rest are equally devided between Batches 2 and 3.
#
# mode twoGroups=F
# will generate 2 baches with 1 cell type
# then will evaluate a number of metrics from the CellMixS Bioconductor package

simulateAndMeasure <- function(ncellsTotal, freqBatchAB, freqBatchA=NULL, twoGroups=T, de.facLoc=0.2, de.facScale=0.2, batch.facScale=0, batch.facLoc=0, mySeed=123, k=20, cell_min=10, batch_min=NULL, ncores=8){
  
if(is.null(freqBatchA)) {
  freqBatchA <- (1-freqBatchAB)/2
}
  freqBatchB <- 1-freqBatchAB-freqBatchA

if (twoGroups){
  group.prob <- 0.5 # cell type A frequency; avoid group.prob close to 0 or 1
  batchCells <- round(c(freqBatchAB,freqBatchA/group.prob,freqBatchB/(1-group.prob))*ncellsTotal)
  test <- splatSimulate(batchCells = batchCells, group.prob = c(group.prob, 1-group.prob), method = "groups", verbose = FALSE, de.facLoc=de.facLoc, de.facScale=de.facScale, batch.facScale= batch.facScale, batch.facLoc=batch.facLoc ) # see ?SplatParams  de.facLoc=0.2, de.facScale=0.2
  test <- test[,!(test$Batch=="Batch2" & test$Group=="Group1")]
  test <-  test[,!(test$Batch=="Batch3" & test$Group=="Group2")]
} else {
  batchCells <- round(c(freqBatchAB,1-freqBatchAB)*ncellsTotal)
  test <- splatSimulate(batchCells = batchCells, verbose = FALSE, de.facLoc=de.facLoc, de.facScale=de.facScale, batch.facScale= batch.facScale, batch.facLoc=batch.facLoc )
  test$Group <- "Group1"
}
  
set.seed(mySeed)

print(table(test$Group,test$Batch))
test <- logNormCounts(test)
test <- runPCA(test, ncomponents = 2, ntop=500)
#test <- evalIntegration(test, metrics = c("isi", "cms","mixingMetric","entropy"), group = "Batch", k = k, n_dim = 2, cell_min = cell_min, weight = F, BPPARAM=MulticoreParam(workers=ncores))
test <- evalIntegration(test, metrics = c("isi", "cms"), group = "Batch", k = k, n_dim = 2, cell_min = cell_min, weight = F, batch_min = batch_min, BPPARAM=MulticoreParam(workers=ncores))

if(twoGroups){
test <- evalIntegration(test, metrics = c("isi", "cms"), group = "Group", k = k, n_dim = 2, cell_min = cell_min, weight = F, batch_min = batch_min, BPPARAM=MulticoreParam(workers=ncores), res_name = c("cluster_lisi", "cluster_cms"))
}

return(test)
}
# plot pca and metrics
plotSimul <- function(test){
  plotPCA(test, shape_by = "Group", colour_by = "Batch") + scale_color_discrete(labels = paste(levels(test$Batch),":",table(test$Batch))) + coord_fixed() + labs(subtitle =  sprintf("cms %.2f; lisi %.2f; cluster_cms %.2f; cluster_lisi %.2f",mean(test$cms_smooth),mean(test$isi),mean(test$cms_smooth.cluster_cms),mean(test$cluster_lisi)),
) | plotPCA(test, shape_by = "Batch", colour_by = "Group") + coord_fixed()
}

Batch effect magnitude

First, let’s evaluate how the different mixing metrics detect increasing levels of simulated batch effect:

Parameters

k = 20
cell_min = 5
ncores = 8
ncellsTotal = 1000
freqBatchAB = 0.5
test.list.single <- list()
for (batch.factor in seq(0,0.12,by=0.03)) { 
 name <- paste0("batch0_single_batchFactor_",batch.factor)
  test.list.single[[name]] <- simulateAndMeasure(ncellsTotal = ncellsTotal, freqBatchAB = freqBatchAB, batch.facScale = batch.factor, batch.facLoc = batch.factor, k=k, cell_min=cell_min, ncores=ncores, twoGroups = F)
}
##         
##          Batch1 Batch2
##   Group1    500    500
##         
##          Batch1 Batch2
##   Group1    500    500
##         
##          Batch1 Batch2
##   Group1    500    500
##         
##          Batch1 Batch2
##   Group1    500    500
##         
##          Batch1 Batch2
##   Group1    500    500
pp <- lapply(names(test.list.single), function(name) { 
  plotPCA(test.list.single[[name]], shape_by = "Group", colour_by = "Batch") + scale_color_discrete(labels = paste(levels(test.list.single[[name]]$Batch),":",table(test.list.single[[name]]$Batch))) + coord_fixed() + labs(subtitle =  sprintf("cms %.2f; lisi %.2f",mean(test.list.single[[name]]$cms_smooth),mean(test.list.single[[name]]$isi))
) & ggtitle(name) & theme(aspect.ratio=1)  | visHist(test.list.single[[name]],metric = c("cms"))
  })
pp
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

pp.out <- wrap_plots(pp, ncol = 2)
ggsave("testMetricsSynthethicData_SingleCellType_BatchFactor.png",pp.out,width = 10,height = 10)
test.list.single.metrics <- data.frame(batchFactor = sub("batch0_single_batchFactor_","",names(test.list.single)),
                                      cms=sapply(test.list.single, function(x) mean(x$cms_smooth)),
                                      lisi= sapply(test.list.single, function(x) mean(x$isi)))
 
test.list.single.metrics %>% pivot_longer(cols=c("cms","lisi")) %>% ggplot(aes(x=batchFactor, y=value, group=name)) + geom_line() + facet_wrap(~ name, scales = "free")

Metrics cms and lisi behave as expected, decreasing with batch effects between the two simulated batches. Mean cms ranges from 0.5 for perfect mixing to 0, no mixing; lisi goes from almost 2 for perfect mixing to 1 for no mixing.

Differential batches abundances

Next, let’s evaluate how the metrics are affected by differential batches abundances

Parameters

ncellsTotal = 1000
test.list.single.balance <- list()
for (myfreqBatchAB in seq(0.2,0.8,by=0.1)) { 
 name <- paste0("batch0_single_balance_",myfreqBatchAB)
  test.list.single.balance[[name]] <- simulateAndMeasure(ncellsTotal = ncellsTotal, freqBatchAB = myfreqBatchAB, batch.facScale = 0, batch.facLoc = 0, k=k, cell_min=cell_min, ncores=ncores, twoGroups = F)
}
##         
##          Batch1 Batch2
##   Group1    200    800
##         
##          Batch1 Batch2
##   Group1    300    700
##         
##          Batch1 Batch2
##   Group1    400    600
##         
##          Batch1 Batch2
##   Group1    500    500
##         
##          Batch1 Batch2
##   Group1    600    400
##         
##          Batch1 Batch2
##   Group1    700    300
##         
##          Batch1 Batch2
##   Group1    800    200
pp <- lapply(names(test.list.single.balance), function(name) { 
  plotPCA(test.list.single.balance[[name]], shape_by = "Group", colour_by = "Batch") + scale_color_discrete(labels = paste(levels(test.list.single.balance[[name]]$Batch),":",table(test.list.single.balance[[name]]$Batch))) + coord_fixed() + labs(subtitle =  sprintf("cms %.2f; lisi %.2f",mean(test.list.single.balance[[name]]$cms_smooth),mean(test.list.single.balance[[name]]$isi))
) & ggtitle(name) & theme(aspect.ratio=1)  | visHist(test.list.single.balance[[name]],metric = c("cms"))
  })
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
pp
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

pp.out <- wrap_plots(pp, ncol = 2)
ggsave("testMetricsSynthethicData_SingleCellType_BatchBalance.png",pp.out,width = 10,height = 10)
test.list.single.balance.metrics.balance <- data.frame(balance = sub("batch0_single_balance_","",names(test.list.single.balance)),
                                      cms=sapply(test.list.single.balance, function(x) mean(x$cms_smooth)),
                                      lisi= sapply(test.list.single.balance, function(x) mean(x$isi)))
 
test.list.single.balance.metrics.balance %>% pivot_longer(cols=c("cms","lisi")) %>% ggplot(aes(x=balance, y=value, group=name)) + geom_line() + facet_wrap(~ name, scales = "free")

Mean cms and lisi peak at equal batch size. Both metrics are strongly affected by batch size unbalance. The larger the unbalance, the lower cms/lisi.

Batches of heterogeneous cell type composition

Now let’s see how the mixing metrics behave when the batches contain different cell types that should not mix, with different levels of batch effect

Parameters

ncellsTotal = 1000
freqBatchAB = 1/3
batch.factor.strong = 0.1
batch.factor.small = 0.06
test.list <- list()
test.list[["batchStrong"]] <- simulateAndMeasure(ncellsTotal = ncellsTotal, freqBatchAB = freqBatchAB, batch.facScale = batch.factor.strong, batch.facLoc = batch.factor.strong, k=k, cell_min=cell_min, ncores=ncores)
##         
##          Batch1 Batch2 Batch3
##   Group1    177      0    328
##   Group2    156    335      0
plotSimul(test.list[["batchStrong"]])
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

Here the large effects are large, and cells are similarly separately by batch and cell type. Because batches do not overlap, lisi is ~1 and cms is ~0. Similarly, cells of one type do not mix with cells of another type, and thus ‘cluster/celltype lisi’ or ‘cluster/celltype cms’ are also ~1 and ~0, respectively.

test.list[["batchMild"]] <- simulateAndMeasure(ncellsTotal = ncellsTotal, freqBatchAB = freqBatchAB, batch.facScale = batch.factor.small, batch.facLoc = batch.factor.small, k=k, cell_min=cell_min, ncores=ncores)
##         
##          Batch1 Batch2 Batch3
##   Group1    171      0    333
##   Group2    162    317      0
plotSimul(test.list[["batchMild"]])
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

Here with mild batch effects, cells are largely grouped by cell type, and cells of different batches are more mixed. In this situation, both cms and lisi increase. Cells of different type do not mix with each other, so cluster_cms and cluster_lisi remain at minimum levels

test.list[["batch0"]] <- simulateAndMeasure(ncellsTotal = ncellsTotal, freqBatchAB = freqBatchAB, batch.facScale = 0, batch.facLoc = 0, k=k, cell_min=cell_min, ncores=ncores)
##         
##          Batch1 Batch2 Batch3
##   Group1    155      0    318
##   Group2    178    315      0
plotSimul(test.list[["batch0"]])
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

Without batch effects, cells of the same type from different batches are perfectly mixed. In this situation, cms and lisi further increase. Cells of different type do not mix with each other, so cluster_cms and cluster_lisi remain at minimum levels

test.list[["batch0_unbalanced"]] <- simulateAndMeasure(ncellsTotal = ncellsTotal, freqBatchAB = freqBatchAB/2, batch.facScale = 0, batch.facLoc = 0, k=k, cell_min=cell_min, ncores=ncores)
plotSimul(test.list[["batch0_unbalanced"]])
test.list[["overcorrected"]] <- simulateAndMeasure(ncellsTotal = ncellsTotal, freqBatchAB = freqBatchAB, batch.facScale = 0, batch.facLoc = 0, de.facLoc = 0, de.facScale = 0, k=k, cell_min=cell_min, ncores=ncores)
##         
##          Batch1 Batch2 Batch3
##   Group1    168      0    337
##   Group2    165    328      0
plotSimul(test.list[["overcorrected"]])
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

If we applied an integration method that ‘overcorrects’ (ie that removes batch effect very strongly at the expense of also removing biological signal), all cells from different batches are perfectly mixed, irrespective of the cell type .

In this situation, cms and lisi further increase, indicating better batch mixing, irrespective of the non preservation of biological variation.

Now, because cells of different type mix with each other, cluster_cms and cluster_lisi strongly increase

This example illustrates that a higher cms/lisi is not necessarily indicating a better integration. In this case, increase of cms/lisi is associated with overcorrection and loss of cell type -specific signals

plot.list <- lapply(names(test.list),function(name){
  plotSimul(test.list[[name]]) & ggtitle(name) & theme(aspect.ratio=1) | visHist(test.list[[name]],metric = c("cms"))
  #plotPCA(test, shape_by = "Group", colour_by = "Batch") + ggtitle(sprintf("cms %.2f; lisi %.2f",mean(test$cms_smooth),mean(test$isi))) | visHist(test,metric = c("cms"))
})

wrap_plots(plot.list,ncol = 2)

ggsave("testMetricsSynthethicData.png", width = 20, height = 10)
lapply(test.list,function(test){
  visOverview(test, "Batch", metric = c("cms", "isi"), other_var = "Group")    
})

Export datasets

ndim=2

exportLists <- list(batchEffect=test.list.single,batchBalance=test.list.single.balance,batchCellTypes=test.list)

for (task in names(exportLists)){
i <- exportLists[[task]]
seurat.list <- lapply(names(i), function(name){
    sce <- i[[name]]
    obj <- as.Seurat(sce, counts = "counts", data = "logcounts")
    obj <- RenameAssays(obj, originalexp = 'RNA')
    #obj <- NormalizeData(obj) %>% ScaleData() %>% RunPCA(npcs=ndim)
    obj
})
names(seurat.list) <- names(i)

saveRDS(seurat.list,paste0(task,".synthetic.seurat.rds"))

}